Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)
Keep DE genes
# Keep only the differentially expressed genes (adjusted p-value < 0.05).
# A single NA-safe mask is used for every object so the three subsets cannot
# drift apart (the original code mixed `padj<0.05` with an explicit
# `padj<0.05 & !is.na(padj)` variant).
# NOTE(review): this assumes DE_info and datGenes rows are aligned in the same
# gene order — confirm against data_preprocessing.Rmd
de_mask = !is.na(DE_info$padj) & DE_info$padj < 0.05
de_genes = rownames(DE_info)[de_mask]
datExpr = datExpr %>% filter(rownames(.) %in% de_genes)
datGenes = datGenes %>% filter(feature_id %in% de_genes)
rownames(datExpr) = datGenes$feature_id
DE_info = DE_info %>% filter(!is.na(padj) & padj<0.05)
print(paste0('Keeping ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "Keeping 3000 genes and 80 samples"
Using biweight midcorrelation because it is more robust to outliers than regular (Pearson) correlation or the mutual information score
# Enable WGCNA multi-threading before computing the gene-gene correlations
allowWGCNAThreads()
## Allowing multi-threading with up to 4 threads.
# MAD = median absolute deviation
# Biweight midcorrelation between genes (genes become columns after transposing)
cor_mat = bicor(t(datExpr))
# Container accumulating every clustering produced in this analysis
clusterings = list()
Average linkage seems to be better at separating outliers than identifying modules
# NOTE(review): hclust receives as.dist(cor_mat), i.e. raw correlations treated
# as distances — highly correlated genes get LARGE "distances"; 1 - cor (or
# 1 - |cor|) is the usual dissimilarity. Confirm this inversion is intended.
# Average-linkage clustering on the correlation matrix
dend = cor_mat %>% as.dist %>% hclust(method='average')
plot(dend, hang = 0, labels=FALSE)
# Zoom in near the bottom of the dendrogram (labels blanked for readability)
dend %>% as.dendrogram %>% set('labels', rep('', nrow(cor_mat))) %>% plot(ylim=c(-0.001,0.05))
# Complete linkage (hclust's default) on the same matrix
dend = cor_mat %>% as.dist %>% hclust
plot(dend, hang = 0, labels=FALSE)
# Zoom in near the top; the blue line marks the proposed cut height
dend %>% as.dendrogram %>% set('labels', rep('', nrow(cor_mat))) %>% plot(ylim=c(0.5,1))
abline(h=0.75, col='blue')
# Cut into 27 modules (k chosen by eye from the dendrogram above)
best_k = 27
modules = cutree(dend, best_k)
names(modules) = datGenes$feature_id
clusterings[['cor_mat']] = modules
Merge similar modules
# One viridis color per module (moduleEigengenes wants one color per gene)
module_colors = viridis(best_k)
# Module eigengenes (1st principal component of each module)
# NOTE(review): modules[labels(dend)] is in dendrogram leaf order while the
# columns of t(datExpr) are in the original datExpr row order — if those orders
# differ, colors are paired with the wrong genes. Verify.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename eigengene columns from 'ME<hexcolor>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )
Merging Modules 4 and 19, and 8, 13, 15, 21, 22, 24, 27
# Heatmap of eigengene correlations; dark blocks reveal modules to merge
bicor(MEs) %>% as.matrix %>% heatmap(scale='none',
                                     col=rev(colorRampPalette(brewer.pal(9,'YlGnBu'), bias=3)(100)))
# Merge module 19 into 4, and modules 13/15/21/22/24/27 into 8
new_modules = modules
new_modules[modules == 19] = 4
new_modules[modules %in% c(13, 15, 21, 22, 24, 27)] = 8
names(new_modules) = datGenes$feature_id
clusterings[['cor_mat_merged']] = new_modules
rm(best_k, modules, new_modules, dend, MEs_output, MEs)
Correcting the correlation matrix from \(s \in [-1,1]\) to \(s \in [0,1]\). Two methods are proposed: \(s_{ij}=|bw(i,j)|\) and \(s_{ij}=\frac{1+bw(i,j)}{2}\)
-Using \(s_{ij}=\frac{1+bw(i,j)}{2}\), the strongest negative correlations (-1) get mapped to 0 (no correlation) and the zero correlated genes get mapped to the average correlation (0.5), which I don’t think makes much sense
-Using \(s_{ij}=|bw(i,j)|\) we lose the direction of the correlation, but at least we maintain the magnitude of the correlation of all the genes. Decided to use this one
# Unsigned similarity: s_ij = |bicor(i,j)| — keeps magnitude, drops direction
S = abs(cor_mat)
# NOTE(review): again clustering on the similarity itself via as.dist(S) rather
# than on a dissimilarity such as 1-S — confirm this is intended.
dend = S %>% as.dist %>% hclust
plot(dend, hang = 0, labels=FALSE)
# Blue line marks the proposed cut height
abline(h=0.775, col='blue')
# Cut into 21 modules (chosen by eye from the dendrogram)
best_k = 21
modules = cutree(dend, best_k)
names(modules) = datGenes$feature_id
clusterings[['S']] = modules
Merge similar modules
# One viridis color per module (moduleEigengenes wants one color per gene)
module_colors = viridis(best_k)
# Module eigengenes (1st PC per module)
# NOTE(review): modules[labels(dend)] is in dendrogram leaf order while
# t(datExpr) columns are in the original gene order — verify they match.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename eigengene columns from 'ME<hexcolor>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )
Merging Modules (1, 2, 10 and 14), (6, 7, 16 and 19), (3 and 13), and (4 and 20)
# Eigengene correlation heatmap used to pick which modules to merge
bicor(MEs) %>% as.matrix %>% heatmap(scale='none',
                                     col=rev(colorRampPalette(brewer.pal(9,'YlGnBu'), bias=3)(100)))
# Merge: 2/10/14 -> 1, 7/16/19 -> 6, 13 -> 3, 20 -> 4
new_modules = modules
new_modules[modules %in% c(2, 10, 14)] = 1
new_modules[modules %in% c(7, 16, 19)] = 6
new_modules[modules == 13] = 3
new_modules[modules == 20] = 4
names(new_modules) = datGenes$feature_id
clusterings[['S_merged']] = new_modules
rm(cor_mat, best_k, modules, module_colors, MEs_output, MEs, new_modules)
Sigmoid function: \(a(i,j)=sigmoid(s_{ij}, \alpha, \tau_0) \equiv \frac{1}{1+e^{-\alpha(s_{ij}-\tau_0)}}\)
Power adjacency function: \(a(i,j)=power(s_{ij}, \beta) \equiv |S_{ij}|^\beta\)
Chose power adjacency function over the sigmoid function because it has only one parameter to adjust and both methods are supposed to lead to very similar results if the parameters are chosen with the scale-free topology criterion.
Following the scale-free topology criterion because metabolic networks have been found to display approximate scale free topology
# Pick the soft-thresholding power: smallest power at which the scale-free
# topology fit reaches R^2 >= 0.8 (samples as rows after transposing)
best_power = datExpr %>% t %>% pickSoftThreshold(powerVector = 1:15, RsquaredCut=0.8)
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.00342 0.261 0.988 775.000 770.0000 1330.00
## 2 2 0.24500 -1.490 0.986 257.000 247.0000 636.00
## 3 3 0.58100 -2.220 0.963 98.200 90.3000 331.00
## 4 4 0.74900 -2.510 0.990 41.600 36.6000 186.00
## 5 5 0.83600 -2.560 0.990 19.200 15.8000 110.00
## 6 6 0.87300 -2.590 0.994 9.460 7.2500 67.80
## 7 7 0.90000 -2.520 0.994 4.960 3.4900 43.50
## 8 8 0.90800 -2.440 0.988 2.740 1.7600 28.80
## 9 9 0.88700 -2.380 0.951 1.590 0.9140 19.70
## 10 10 0.39100 -3.380 0.358 0.960 0.4870 13.70
## 11 11 0.38500 -3.210 0.349 0.603 0.2660 9.78
## 12 12 0.93500 -1.980 0.982 0.391 0.1490 7.11
## 13 13 0.95700 -1.830 0.985 0.262 0.0850 5.25
## 14 14 0.93300 -1.770 0.941 0.180 0.0495 3.98
## 15 15 0.94200 -1.780 0.964 0.126 0.0295 3.36
# powerEstimate = 5, the first power with SFT.R.sq above the 0.8 cut
print(paste0('Best power for scale free topology: ', best_power$powerEstimate))
## [1] "Best power for scale free topology: 5"
Elevate the matrix to the suggested power
# Power adjacency: raise each similarity to the chosen power (a_ij = s_ij^beta)
S_sft = S^best_power$powerEstimate
# NOTE(review): hclust again receives as.dist of a *similarity* matrix rather
# than a dissimilarity — confirm this is intended.
dend = S_sft %>% as.dist %>% hclust
plot(dend, hang=0, labels=FALSE)
# Blue line marks the proposed cut height
abline(h=0.28, col='blue')
# Cut into 21 modules (chosen by eye)
best_k = 21
modules = cutree(dend, best_k)
names(modules) = datGenes$feature_id
clusterings[['S_sft']] = modules
Merge similar modules
# One viridis color per module (moduleEigengenes wants one color per gene)
module_colors = viridis(best_k)
# Module eigengenes (1st PC per module)
# NOTE(review): modules[labels(dend)] is in dendrogram leaf order while
# t(datExpr) columns are in the original gene order — verify they match.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename eigengene columns from 'ME<hexcolor>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )
Merging Modules (1, 2, 10 and 14), (6, 7, 16 and 19), (3 and 13), and (4 and 20) (same as with S)
# Eigengene correlation heatmap used to pick which modules to merge
bicor(MEs) %>% as.matrix %>% heatmap(scale='none',
                                     col=rev(colorRampPalette(brewer.pal(9,'YlGnBu'), bias=3)(100)))
# Same merges as for S: 2/10/14 -> 1, 7/16/19 -> 6, 13 -> 3, 20 -> 4
new_modules = modules
new_modules[modules %in% c(2, 10, 14)] = 1
new_modules[modules %in% c(7, 16, 19)] = 6
new_modules[modules == 13] = 3
new_modules[modules == 20] = 4
names(new_modules) = datGenes$feature_id
clusterings[['S_sft_merged']] = new_modules
rm(dend, best_k, modules, module_colors, MEs_output, MEs, new_modules)
Additional considerations
What does high mean? using the power=5 we get a mean connectivity of 20, is this high?
# Mean network connectivity (mean column sum of S^power) for each candidate
# power 1..15. Built directly with vapply instead of filling a pre-allocated
# matrix row by row in an index loop.
powers = 1:15
mean_connectivity = data.frame(
  power = powers,
  mean_connectivity = vapply(powers, function(p) mean(colSums(S^p)), numeric(1)))
# Gray vertical line marks the power chosen by pickSoftThreshold
ggplotly(mean_connectivity %>% ggplot(aes(power, mean_connectivity)) + ylab('mean connectivity') +
geom_vline(xintercept=best_power$powerEstimate, color='gray') + geom_point(color='#0099cc') +
scale_y_sqrt() + theme_minimal())
rm(mean_connectivity, best_power, powers)
Slope is 2.6 times as steep as it should be (maybe because the range of k is too narrow?)
# Bucket genes by connectivity k = colSums(S_sft) into 10 bins and estimate
# the scale-free exponent from the log-log regression of p(k) on k
k_df = data.frame('connectivity'=colSums(S_sft), 'bucket'=cut(colSums(S_sft), 10)) %>% group_by(bucket) %>%
dplyr::summarize(p_k=n(), k=mean(connectivity)) %>% mutate(p_k=p_k/sum(p_k))
# Slope of log10(p_k) ~ log10(k) is the estimated scale-free exponent
lm(log10(p_k) ~ log10(k), data=k_df)
##
## Call:
## lm(formula = log10(p_k) ~ log10(k), data = k_df)
##
## Coefficients:
## (Intercept) log10(k)
## 2.687 -2.670
rm(k_df)
Note: The slope is very steep; both axes are on a sqrt scale
# Compare the distributions of S and S_sft entries (flattened and sorted).
# n indexes all nrow(S)^2 matrix entries; spelled out with seq_len so the
# precedence of ^ over : is explicit.
n_entries = nrow(S)^2
plot_data = data.frame('n' = seq_len(n_entries),
                       'S' = sort(melt(S)$value),
                       'S_sft' = sort(melt(S_sft)$value))
# Thin to every 100th entry before plotting the two densities
plot_data %>% filter(n%%100==0) %>% dplyr::select(S, S_sft) %>% melt %>%
  ggplot(aes(value, color=variable, fill=variable)) + geom_density(alpha=0.5) +
  xlab('k') + ylab('p(k)') + scale_x_sqrt() + scale_y_sqrt() + theme_minimal()
rm(plot_data, n_entries)
The example given in the article has a curve similar to this one and they say it’s OK
# Re-derive the binned connectivity table (identical to the one computed and
# removed a few chunks earlier) to visualize the log-log fit
k_df = data.frame('connectivity'=colSums(S_sft), 'bucket'=cut(colSums(S_sft), 10)) %>% group_by(bucket) %>%
dplyr::summarize(p_k=n(), k=mean(connectivity)) %>% mutate(p_k=p_k/sum(p_k))
# k_df %>% ggplot(aes(k,p_k)) + geom_point(color='#0099cc') + geom_smooth(method='lm', se=FALSE, color='gray') +
# ylab('p(k)') + scale_x_log10() + scale_y_log10() + theme_minimal()
# Scatter of log10(p_k) vs log10(k) with the fitted regression line
k_df %>% ggplot(aes(log10(k),log10(p_k))) + geom_point(color='#0099cc') + geom_smooth(method='lm', se=FALSE, color='gray') +
ylab('p(k)') + theme_minimal()
The fit gives \(R^2 = 0.83\), consistent with the \(R^2\) we got from the pickSoftThreshold function
# Full summary of the log-log fit; adjusted R-squared = 0.832
lm(log10(p_k) ~ log10(k), data=k_df) %>% summary
##
## Call:
## lm(formula = log10(p_k) ~ log10(k), data = k_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6508 -0.3561 0.1792 0.2841 0.4629
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6865 0.6685 4.019 0.003848 **
## log10(k) -2.6703 0.3955 -6.751 0.000145 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4222 on 8 degrees of freedom
## Multiple R-squared: 0.8507, Adjusted R-squared: 0.832
## F-statistic: 45.58 on 1 and 8 DF, p-value: 0.0001449
rm(k_df)
Using topological overlap dissimilarity measure because it has been found to result in biologically meaningful modules
1st quartile is already 0.9852, most of the genes are very dissimilar
# Topological overlap matrix from the soft-thresholded similarity
TOM = S_sft %>% TOMsimilarity
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# TOMsimilarity returns an unnamed matrix; restore the gene ids once on TOM so
# that dissTOM, computed from it, inherits the dimnames — the original labelled
# the two matrices separately with four assignments
rownames(TOM) = rownames(S_sft)
colnames(TOM) = colnames(S_sft)
# TOM-based dissimilarity (dimnames carried over from TOM by the arithmetic)
dissTOM = 1-TOM
dissTOM %>% melt %>% summary
## Var1 Var2 value
## ENSG00000000971: 3000 ENSG00000000971: 3000 Min. :0.0000
## ENSG00000001084: 3000 ENSG00000001084: 3000 1st Qu.:0.9852
## ENSG00000001626: 3000 ENSG00000001626: 3000 Median :0.9918
## ENSG00000002586: 3000 ENSG00000002586: 3000 Mean :0.9882
## ENSG00000002746: 3000 ENSG00000002746: 3000 3rd Qu.:0.9956
## ENSG00000002933: 3000 ENSG00000002933: 3000 Max. :0.9999
## (Other) :8982000 (Other) :8982000
# NOTE(review): this clusters on the TOM *similarity* treated as a distance
# (genes with high overlap end up far apart) even though dissTOM was just
# computed and is used by the later clusterings — confirm this is deliberate.
dend = TOM %>% as.dist %>% hclust
plot(dend, hang=0, labels=FALSE)
# Blue line marks the proposed cut height
abline(h=0.14, col='blue')
# Cut into 19 modules (chosen by eye)
best_k = 19
modules = cutree(dend, best_k)
names(modules) = datGenes$feature_id
clusterings[['TOM']] = modules
Merge similar modules
# One viridis color per module (moduleEigengenes wants one color per gene)
module_colors = viridis(best_k)
# Module eigengenes (1st PC per module)
# NOTE(review): modules[labels(dend)] is in dendrogram leaf order while
# t(datExpr) columns are in the original gene order — verify they match.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename eigengene columns from 'ME<hexcolor>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )
Merging Modules (2, 3, 4 and 8)
# Eigengene correlation heatmap used to pick which modules to merge
bicor(MEs) %>% as.matrix %>% heatmap(scale='none',
                                     col=rev(colorRampPalette(brewer.pal(9,'YlGnBu'), bias=3)(100)))
# Merge modules 3, 4 and 8 into module 2
new_modules = modules
new_modules[modules %in% c(3, 4, 8)] = 2
names(new_modules) = datGenes$feature_id
clusterings[['TOM_merged']] = new_modules
rm(S, S_sft, TOM, dend, best_k, modules, module_colors, MEs_output, MEs, new_modules)
Using hierarchical clustering on the TOM-based dissimilarity matrix
In average linkage hierarchical clustering, the distance between two clusters is defined as the average distance between each point in one cluster to every point in the other cluster.
It’s not easy to determine which number of clusters is optimal
# Average-linkage hierarchical clustering on the TOM dissimilarity
dend = dissTOM %>% as.dist %>% hclust(method='average')
plot(dend, hang=0, labels=FALSE)
Zooming in on the root of the dendrogram you can see that it just separates outliers and it only separates the data into two big groups until farther down. And even after this, it looks like it continues to filter out outliers almost one by one. I don’t think this is ideal…
dend %>% as.dendrogram %>% set('labels', rep('', nrow(dissTOM))) %>% plot(ylim=c(0.99,1))
In complete linkage hierarchical clustering, the distance between two clusters is defined as the longest distance between two points in each cluster.
It’s not easy to see where you should do the cut
# Complete-linkage (hclust's default) clustering on the TOM dissimilarity
dend = dissTOM %>% as.dist %>% hclust
plot(dend, hang=0, labels=FALSE)
Zooming in on the beginning of the dendrogram, it seems like maybe 14 is a good number of clusters? Although it’s quite arbitrary…
# Zoom in near the top; blue line marks a candidate cut height
dend %>% as.dendrogram %>% set('labels', rep('', nrow(dissTOM))) %>% plot(ylim=c(0.999,1))
abline(h=0.99943, col='blue')
Looks balanced…
# Cut into 23 modules and color the dendrogram branches by module
# NOTE(review): the prose above suggested 14 clusters but 23 is used — confirm.
best_k = 23
dend %>% as.dendrogram %>% set('labels', rep('', nrow(dissTOM))) %>% set('branches_k_color', k=best_k) %>% plot
modules = cutree(dend, best_k)
names(modules) = datGenes$feature_id
clusterings[['dissTOM']] = modules
# Module sizes; note module 8 contains only 2 genes
table(modules)
## modules
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 205 72 297 107 208 153 96 2 267 228 326 69 102 191 147 75 27 113
## 19 20 21 22 23
## 80 57 46 19 113
“One can relate modules to each other by correlating the corresponding module eigengenes (Horvath et al., 2005). If two modules are highly correlated, one may want to merge them”
Calculate the “eigengenes” (1st principal component) of each module
# One viridis color per module (moduleEigengenes wants one color per gene)
module_colors = viridis(best_k)
# Module eigengenes (1st PC per module)
# NOTE(review): modules[labels(dend)] is in dendrogram leaf order while
# t(datExpr) columns are in the original gene order — verify they match.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename eigengene columns from 'ME<hexcolor>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )
Merging Modules 3, 5, 6, 9, 10, 11 and 14
# Eigengene correlation heatmap used to pick which modules to merge
bicor(MEs) %>% as.matrix %>% heatmap(scale='none',
col=rev(colorRampPalette(brewer.pal(9,'YlGnBu'), bias=3)(100)))
# Merge modules 5, 6, 9, 10, 11 and 14 into module 3
new_modules = modules %>% replace(modules %in% c(5,6,9,10,11,14), 3)
names(new_modules) = datGenes$feature_id
clusterings[['dissTOM_merged']] = new_modules
# Colors for the annotation bars under the dendrogram (same viridis palette)
viridis_colors = viridis(best_k)
dend_colors = data.frame('ID'=names(modules[labels(dend)]),
'OriginalModules' = viridis_colors[as.vector(modules[labels(dend)])],
'MergedModules' = viridis_colors[as.vector(new_modules[labels(dend)])])
dend %>% as.dendrogram %>% set('labels', rep('', nrow(dissTOM))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_colors[,-1])
# Each object listed exactly once: the original rm() repeated MEs and
# new_modules, which raised two "object not found" warnings on the duplicates
rm(new_modules, MEs, dend, best_k, modules, module_colors, MEs_output, dend_colors, viridis_colors)
# Pairwise adjusted Rand index between all clusterings (upper triangle only;
# the matrix is symmetric so the lower triangle stays NA)
n_clusterings = length(clusterings)
cluster_sim = data.frame(matrix(nrow = n_clusterings, ncol = n_clusterings))
for(i in seq_len(n_clusterings)){
  cluster1 = as.factor(clusterings[[i]])
  # Start at i so the diagonal (self-similarity = 1) is included
  for(j in i:n_clusterings){
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
# Annotated heatmap; TRUE/FALSE spelled out instead of the reassignable T/F
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
          cexRow = 1, cexCol = 1, margins = c(8,8))
rm(i, j, cluster1, cluster2, cluster_sim, n_clusterings)
# PCA on datExpr (genes in rows), so each point in the plot below is a gene
pca = datExpr %>% prcomp
# First 3 PCs plus every clustering, for the interactive scatter plot.
# NOTE(review): assumes each clusterings[[...]] vector is in the same gene
# order as rownames(datExpr) — confirm.
plot_data = data.frame('ID' = rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3],
'cor_mat' = clusterings[['cor_mat']], 'cor_mat_merged' = clusterings[['cor_mat_merged']],
'S' = clusterings[['S']], 'S_merged' = clusterings[['S_merged']],
'S_sft' = clusterings[['S_sft']], 'S_sft_merged' = clusterings[['S_sft_merged']],
'TOM' = clusterings[['TOM']], 'TOM_merged' = clusterings[['TOM_merged']],
'dissTOM' = clusterings[['dissTOM']], 'dissTOM_merged' = clusterings[['dissTOM_merged']])
selectable_scatter_plot(plot_data[,-1], plot_data[,-1])
#ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=OriginalModules)) + geom_point(alpha=0.5) + theme_minimal())
rm(pca, plot_data)